package org.ykx.demo.sql

import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.dmg.pmml.True

object HiveDemo {

  /** HDFS path of the comma-separated input file read by [[main]]. */
  val dfDirPath = "hdfs://master:8020/tmp/original-data.csv"
  val conf = new SparkConf().setMaster("local[*]").setAppName("DataFrameTest")
  val sc = new SparkContext(conf)
  // Created but not used in main; kept because it is a public member of this
  // object and removing it could break external callers.
  val sqlContext = new SQLContext(sc)

  /**
   * Loads the CSV into an RDD with 3 partitions, projects the first three
   * fields of each line into a tuple RDD, prints count/partition diagnostics,
   * then runs a word-count-style aggregation over all comma-separated tokens
   * and prints its partition count before stopping the SparkContext.
   *
   * @param args unused command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val rdd = sc.textFile(dfDirPath, 3)
    // NOTE(review): assumes every line has at least 3 comma-separated fields —
    // a shorter line would raise ArrayIndexOutOfBoundsException when an action
    // forces evaluation. Confirm against the input data.
    val nrdd = rdd.map(_.split(",")).map(fields => (fields(0), fields(1), fields(2)))
    println(s"RDD Count: ${rdd.count()}")
    println(s"RDD partitions: ${rdd.partitions.length} RDD partitioner: ${rdd.partitioner}")
    println(s"NRDD partitions: ${nrdd.partitions.length}")

//    rdd.map { line => line.split(",") }.map { (_,1) }.reduceByKey(_+_).foreach(println)
    println("==========================================")
    // Token frequency count: split every line on commas, pair each token with 1,
    // and sum per token.
    val crdd = rdd.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _)
    println(s"CRDD partitions: ${crdd.partitions.length}")

    sc.stop()
  }
}